import pandas as pd
import numpy as np
import os
# Inspect the working directory and the contents of the `data` folder
# (bare expressions: their values are only visible as notebook cell output).
os.getcwd()
os.listdir('data')
# Load the training data and print a summary of the resulting DataFrame.
all_data = pd.read_csv('data/train.csv')
all_data.info()
# Import pandas_profiling - a useful library for EDA
from pandas_profiling import ProfileReport
# Generate profiling for all features
profile = ProfileReport(all_data, title='Profiling Report', explorative = True)
# Bare expression: presumably renders the report as notebook cell output.
profile
# Since the field `Id` is 100% distinct and has no missing values, it can be used as the index
all_data.set_index('Id', inplace = True)
# Split the predictor columns (everything except the target `Risk_Flag`)
# into categorical and numerical features according to their dtype:
# 'object' columns are categorical, all others are numerical.
data_types = all_data.drop('Risk_Flag', axis=1).dtypes
cat_features = [
    name for name in data_types.index if data_types[name] == 'object'
]
num_features = [
    name for name in data_types.index if not data_types[name] == 'object'
]
# Bare expressions: echo both lists (notebook-style inspection).
cat_features
num_features
import matplotlib.pyplot as plt

# Partition the rows by target value so both classes can be compared
# side by side.
eda_data = {
    'defaulted': all_data[all_data['Risk_Flag'] == 1],
    'not_defaulted': all_data[all_data['Risk_Flag'] == 0]
}

# One weight of 1/n per row, so the bars of each class histogram add up to 1
# and the two classes are comparable despite their different sizes.
weights = dict()
for class_, class_frame in eda_data.items():
    n_rows = len(class_frame.index)
    weights[class_] = np.ones(n_rows) / n_rows

# One figure per numerical feature, one panel per class.
for col in num_features:
    fig, ax = plt.subplots(1, 2)
    for idx, class_ in enumerate(eda_data):
        panel = ax[idx]
        eda_data[class_][col].hist(ax=panel, weights=weights[class_])
        panel.set_title(class_)
    fig.suptitle(col)
    plt.tight_layout()
    plt.show()
# Striking point(s):
# - For not_defaulted users, Experience shows a higher percentage of workers
#   with 20 years of experience.

# One figure per categorical feature, one panel per class: relative frequency
# of the 10 most common values of that feature within the class.
for col in cat_features:
    fig, ax = plt.subplots(1, 2)
    for idx, class_ in enumerate(eda_data):
        top_shares = eda_data[class_][col].value_counts(normalize=True)[:10]
        top_shares.plot.bar(ax=ax[idx])
        ax[idx].set_title(class_)
    fig.suptitle(col)
    plt.tight_layout()
    plt.show()
# Striking point(s):
# - Profession, STATE and CITY of the two classes are very different.
from catboost import CatBoostClassifier
class CustomizedCatBoostClassifier(CatBoostClassifier):
    """CatBoost classifier that carves its own validation set out of the fit data.

    ``fit`` holds out 30% of the rows it receives and hands them to CatBoost
    as ``eval_set``, so that a plain ``fit(X, y)`` call (e.g. from inside a
    scikit-learn ``Pipeline``) still provides evaluation data to the model.
    """

    def fit(self, X, y, *args, **kwargs):
        """Fit the model on a random 70% of ``(X, y)`` and evaluate on the rest.

        Args:
            X: Training features.
            y: Training labels.
            *args: Extra positional arguments forwarded to
                ``CatBoostClassifier.fit``.
            **kwargs: Extra keyword arguments forwarded to
                ``CatBoostClassifier.fit``. If a non-``None`` ``eval_set`` is
                given, it is used as-is and no internal split is made.

        Returns:
            The fitted estimator (``self``).
        """
        # Imported here so the class does not depend on a module-level import
        # that only appears further down the file.
        from sklearn.model_selection import train_test_split

        # Respect an explicit eval_set instead of failing with
        # "got multiple values for keyword argument 'eval_set'".
        eval_set = kwargs.pop('eval_set', None)
        if eval_set is None:
            X, X_eval, y, y_eval = train_test_split(X, y, test_size=0.3)
            eval_set = (X_eval, y_eval)

        super().fit(X, y, *args, eval_set=eval_set, **kwargs)
        return self
# Since some features have very high cardinality (e.g. CITY has more than 300 values), a CardinalityHandler that restricts the cardinality is useful to speed up training.
from sklearn import set_config
# Set scikit-learn's estimator display mode to 'diagram'.
set_config(display='diagram')
from sklearn.base import TransformerMixin, BaseEstimator
class CardinalityHandler(BaseEstimator, TransformerMixin):
    """Cap the number of distinct values kept in selected columns.

    ``fit`` records the most frequent values of every configured column;
    ``transform`` replaces every value that was not recorded with the
    placeholder ``'capped_value'``.

    Args:
        columns: Names of the columns whose cardinality is restricted.
        cardinality: For each entry of ``columns`` (same order, same length),
            how many of the most frequent values to keep.

    Attributes:
        top_values: Dict mapping each column name to the list of values that
            are kept; filled by ``fit``.

    Raises:
        ValueError: If ``columns`` and ``cardinality`` differ in length.
    """

    def __init__(self, columns, cardinality):
        if len(columns) != len(cardinality):
            raise ValueError("The len of the list of columns does not match the len of the cardinality")
        self.columns = columns
        self.cardinality = cardinality
        self.top_values = dict()

    def fit(self, X, y=None):
        """Learn the values to keep for every configured column of ``X``.

        Args:
            X: DataFrame containing all configured columns.
            y: Ignored; present for scikit-learn API compatibility.

        Returns:
            The fitted transformer (``self``).
        """
        # Start from a clean slate so a re-fit never keeps entries learned
        # from a previous call.
        self.top_values = dict()
        for column, limit in zip(self.columns, self.cardinality):
            # value_counts() is ordered by descending frequency.
            kept = list(X[column].value_counts().index)[:limit]
            # 'unk' and 'capped_value' are always allowed, so they are never
            # themselves replaced during transform.
            for placeholder in ('unk', 'capped_value'):
                if placeholder not in kept:
                    kept.append(placeholder)
            self.top_values[column] = kept
        return self

    def transform(self, X, y=None, copy=False):
        """Replace every value not learned during ``fit`` by ``'capped_value'``.

        Args:
            X: DataFrame containing all fitted columns.
            y: Ignored; present for scikit-learn API compatibility.
            copy: If true, work on a copy of ``X``; otherwise ``X`` itself is
                modified in place.

        Returns:
            The transformed DataFrame (``X`` itself when ``copy`` is false).
        """
        transform_data = X.copy() if copy else X
        for col, top_val in self.top_values.items():
            # Set membership is O(1) per row instead of scanning the list.
            allowed = set(top_val)
            transform_data.loc[:, col] = transform_data[col].apply(
                lambda value: value if value in allowed else 'capped_value'
            )
        return transform_data
from sklearn.impute import SimpleImputer
class CustomizedSimpleImputer(SimpleImputer):
    """SimpleImputer whose ``transform`` returns a DataFrame instead of an array."""

    def transform(self, X):
        """Impute ``X`` and re-wrap the result with ``X``'s columns and index.

        Args:
            X: DataFrame to impute; its ``columns`` and ``index`` are reused
                for the output.

        Returns:
            DataFrame holding the imputed values.
        """
        imputed_values = super().transform(X)
        imputed_frame = pd.DataFrame(
            imputed_values,
            columns=X.columns,
            index=X.index,
        )
        return imputed_frame
from sklearn.compose import ColumnTransformer
class CustomizedColumnTransformer(ColumnTransformer):
    """ColumnTransformer that combines its branch outputs with pandas."""

    def _hstack(self, Xs):
        """Join the per-transformer outputs side by side.

        Args:
            Xs: Outputs of the individual transformers; they are handed
                to ``pd.concat`` unchanged.

        Returns:
            The column-wise concatenation of ``Xs``.
        """
        combined = pd.concat(Xs, axis='columns')
        return combined
from sklearn.pipeline import FeatureUnion, Pipeline
class CustomizedFeatureUninon(FeatureUnion):
    """FeatureUnion that labels its stacked output with the input column names."""

    def _hstack(self, Xs):
        """Stack ``Xs`` with the parent implementation and wrap it in a DataFrame.

        Args:
            Xs: Outputs of the individual transformers; each must expose
                ``.columns``.

        Returns:
            DataFrame whose columns are the concatenated column names of
            ``Xs``. The index is the default one created by ``pd.DataFrame``.
        """
        # Collect the column names first, in the order the frames are stacked.
        names_per_frame = [frame.columns.tolist() for frame in Xs]
        all_names = np.hstack(names_per_frame)
        stacked = super()._hstack(Xs)
        return pd.DataFrame(stacked, columns=all_names)
# --- Categorical branch: fill missing values, then cap CITY's cardinality ---
cat_imputer = CustomizedSimpleImputer(
    missing_values=np.nan,
    strategy='constant',
    fill_value='unk',  # short for unknown
)
columns = ['CITY']
cardinality = [100]
cardinality_handler = CardinalityHandler(columns, cardinality)
cat_handler = Pipeline(
    steps=[
        ('imputer', cat_imputer),
        ('cardinalityHandler', cardinality_handler),
    ],
    verbose=False,
)

# --- Numerical branch: fill missing values with the most frequent value ---
num_imputer = CustomizedSimpleImputer(
    missing_values=np.nan,
    strategy='most_frequent',
)

# --- Route each group of columns to its branch ---
preprocessor = CustomizedColumnTransformer(
    [
        ('catHandler', cat_handler, cat_features),
        ('numHandler', num_imputer, num_features),
    ],
    n_jobs=-1,
)
fu = CustomizedFeatureUninon([
    ('featureUnion', preprocessor),
])

# --- Full pipeline: preprocessing followed by the CatBoost estimator ---
estimator = CustomizedCatBoostClassifier(
    cat_features=cat_features,
    use_best_model=True,
    verbose=True,
    metric_period=100,
    early_stopping_rounds=25,
    auto_class_weights='Balanced',
)
pl = Pipeline(
    steps=[
        ('preprocessor', fu),
        ('estimator', estimator),
    ],
    verbose=True,
)
# Bare expression: presumably displays the pipeline as notebook cell output.
pl
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation.
train_data, test_data = train_test_split(all_data, test_size=0.2)
train_data.head()

# Separate the target `Risk_Flag` from the predictors in both partitions.
label = train_data['Risk_Flag'].copy()
train_data = train_data.drop(columns='Risk_Flag')
test_label = test_data['Risk_Flag'].copy()
test_data = test_data.drop(columns='Risk_Flag')

# Bare expressions: sizes and class balance of both partitions
# (notebook-style inspection).
train_data.shape
label.value_counts(normalize=True)
test_data.shape
test_label.value_counts(normalize=True)
# The data is extremely imbalanced; therefore, oversampling is necessary.
# SMOTENC takes the positional indices of the categorical columns.
cat_col_loc = [train_data.columns.get_loc(col) for col in cat_features]

from imblearn.over_sampling import SMOTENC

# Oversample the training partition only; the test partition stays untouched.
X_resampled, y_resampled = SMOTENC(categorical_features=cat_col_loc).fit_resample(train_data, label)
pl.fit(X_resampled, y_resampled)
y_pred = pl.predict(test_data)

from sklearn.metrics import classification_report, roc_auc_score, plot_confusion_matrix

print(classification_report(test_label, y_pred))
# ROC AUC measures ranking quality, so it must be computed from scores rather
# than hard 0/1 predictions. Column 1 of predict_proba is taken as the
# probability of the positive class (Risk_Flag == 1), assuming the binary
# classes are ordered [0, 1].
roc_auc_score(test_label, pl.predict_proba(test_data)[:, 1])
from sklearn.metrics import roc_auc_score, roc_curve, auc
from itertools import cycle
def auc_report(model, X, y_true, fig, ax):
    """Print, plot and return one-vs-rest ROC AUC scores of a fitted classifier.

    For every class in ``model.classes_`` the matching probability column of
    ``model.predict_proba(X)`` is scored against the binary indicator
    ``y_true == class``. The per-class ROC curves, their macro average and
    the chance diagonal are all drawn on ``ax``.

    Args:
        model: Fitted classifier exposing ``classes_`` and ``predict_proba``.
        X: Feature data passed to ``model.predict_proba``.
        y_true: True labels; must support ``(y_true == label).astype(int)``
            (e.g. a pandas Series or a numpy array).
        fig: Figure that receives the super-title.
        ax: Axes on which every curve is drawn.

    Returns:
        Dict with the key ``'macro_auc'`` plus one ``'auc_<class>'`` entry
        per class.
    """
    classes = model.classes_
    y_pred_classes = model.predict_proba(X)
    n_classes = len(classes)
    lw = 2

    # Compute ROC curve and ROC area for each class (one-vs-rest).
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i, class_label in enumerate(classes):
        is_class = (y_true == class_label).astype(int)
        class_scores = y_pred_classes[:, i]
        print(f"""{class_label}: {roc_auc_score(y_true=is_class, y_score=class_scores)}""")
        fpr[i], tpr[i], _ = roc_curve(y_true=is_class, y_score=class_scores)
        roc_auc[i] = auc(fpr[i], tpr[i])

    # Macro average: interpolate every curve on the union of all false
    # positive rates, then average the true positive rates.
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Plot all ROC curves on the axes supplied by the caller.
    ax.plot(fpr["macro"], tpr["macro"],
            label='macro-average ROC curve (area = {0:0.2f})'
                  ''.format(roc_auc["macro"]),
            color='navy', linestyle=':', linewidth=4)

    colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
    for i, color in zip(range(n_classes), colors):
        # Draw on `ax` (not the implicit pyplot axes) so the curves always
        # land on the same axes as the legend and the macro curve.
        ax.plot(fpr[i], tpr[i], color=color, lw=lw,
                label='ROC curve of class {0} (area = {1:0.2f})'
                      ''.format(classes[i], roc_auc[i]))

    ax.plot([0, 1], [0, 1], 'k--', lw=lw)  # chance diagonal
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    fig.suptitle('Some extension of Receiver operating characteristic to multi-class')
    ax.legend(loc="lower right")

    output = {'macro_auc': roc_auc["macro"]}
    for i, class_label in enumerate(classes):
        output[f'auc_{class_label}'] = roc_auc[i]
    return output
# ROC curves of the fitted pipeline on the held-out test partition.
fig, ax = plt.subplots()
auc_report(pl, test_data, test_label, fig, ax)

# Row-normalised confusion matrix built directly from the predictions that
# were already computed. This avoids wrapping `y_pred` in a fake classifier
# and does not depend on `plot_confusion_matrix`, which was removed from
# scikit-learn in version 1.2.
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

cm = confusion_matrix(test_label, y_pred, labels=pl.classes_, normalize='true')
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=pl.classes_)
_ = cm_display.plot(values_format='.2%', cmap='Blues')
# Feature importances of the fitted CatBoost model.
# The preprocessor reorders the columns (categorical features first, then
# numerical ones), so the names must come from the model itself rather than
# from `train_data.columns`; otherwise importances get the wrong labels.
fitted_model = pl.steps[-1][1]
fi_df = pd.DataFrame({
    'feature': fitted_model.feature_names_,
    'feature_importance': fitted_model.get_feature_importance(),
})
fi_df.set_index('feature').sort_values(by='feature_importance').plot.barh()
import shap

shap.initjs()

# Use the preprocessor exactly as it was fitted during training. Calling
# `fit_transform` here would re-fit it on test rows and silently change the
# behaviour of every later `pl.predict` call.
fitted_preprocessor = pl.steps[0][1]
explainer = shap.TreeExplainer(pl.steps[-1][1])

# Note: the points of categorical features in the plots do not have any
# colour because those features do not have a hierarchy (low, high).

# SHAP summary for (1) the whole test partition, (2) the rows whose true
# label is 0 and (3) the rows whose true label is 1.
shap_subsets = [
    test_data,
    test_data.loc[test_label[test_label == 0].index],
    test_data.loc[test_label[test_label == 1].index],
]
for subset in shap_subsets:
    X = fitted_preprocessor.transform(subset)
    shap_values = explainer.shap_values(X)
    shap.summary_plot(shap_values,
                      X.to_numpy(),
                      feature_names=X.columns)
# Error-analysis frame: prediction, ground truth, a correctness flag and all
# predictor columns of the test partition, row-aligned on the test index.
ea_df = pd.DataFrame({'y_pred': y_pred, 'y_true': test_label})
ea_df['true'] = ea_df['y_pred'].eq(ea_df['y_true'])
ea_df = pd.concat([ea_df, test_data], axis='columns')
# Defaulted
# Misclassified rows that the model predicted as defaulted (y_pred == 1).
# All four plots use the same filter; the first one previously filtered on
# y_true == 1, which selects the rows shown in the "Not Defaulted" section.
wrong_defaulted = ea_df[~ea_df['true'] & (ea_df['y_pred'] == 1)]
wrong_defaulted['Profession'].value_counts()[:10].plot.barh()
wrong_defaulted['Married/Single'].value_counts()[:10].plot.bar()
wrong_defaulted['CURRENT_JOB_YRS'].hist()
wrong_defaulted['Experience'].hist()

# Not Defaulted
# Misclassified rows that the model predicted as not defaulted (y_pred == 0).
wrong_not_defaulted = ea_df[~ea_df['true'] & (ea_df['y_pred'] == 0)]
wrong_not_defaulted['Profession'].value_counts()[:10].plot.barh()
wrong_not_defaulted['Married/Single'].value_counts()[:10].plot.bar()
wrong_not_defaulted['CURRENT_JOB_YRS'].hist()
wrong_not_defaulted['Experience'].hist()
# Striking point(s):
# - The Profession values are not the popular ones of either class.
# - The precision of the defaulted class is low, but this is acceptable
#   because this class is very small compared to the other. Therefore, a
#   small percentage of wrongly classified users in the not defaulted class
#   can lead to a low precision for defaulted.

# Score the unlabeled data with the fitted pipeline and export the result.
predict_data = pd.read_csv('data/predict_data.csv')
predict_data.head()
predict_data.set_index('ID', inplace=True)
prediction = pl.predict(predict_data)
prediction
predict_data['prediction'] = prediction
# Only the index (ID) and the prediction column are written out.
predict_data[['prediction']].to_csv('data/prediction.csv')